package com.tianpengtech.common.webmagic;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that extracts the entries of the
 * {@code ul.note-list} element of a downloaded page and prints, for each
 * entry, its link, its title and its abstract to standard output.
 *
 * <p>Run {@link #main(String[])} to crawl {@code http://www.jianshu.com}.
 */
public class WebMagicProcessor implements PageProcessor{

	/** Crawl settings: retry times set to 3 and sleep time set to 1000. */
	private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

	/**
	 * Prints three lines for every {@code li>div} entry found inside the
	 * page's {@code ul.note-list}: the site domain concatenated with the title
	 * link's {@code href}, the title text, and the abstract text.
	 *
	 * <p>Pages without a note list are ignored, entries without a title link
	 * are skipped, and a missing abstract is printed as an empty line, so a
	 * single malformed entry no longer aborts the whole page.
	 *
	 * @param page the downloaded page handed over by the spider
	 */
	@Override
	public void process(Page page) {
		String content = page.getHtml().css("ul.note-list").get();
		if (content == null) {
			// The selector matched nothing; Jsoup.parse would fail on null input.
			return;
		}

		Document doc = Jsoup.parse(content);
		Elements elements = doc.select("li>div");
		for (Element element : elements) {
			// first() returns null when the selection is empty, so check before use.
			Element title = element.select("a.title").first();
			if (title == null) {
				continue;
			}
			Element summary = element.select("p.abstract").first();

			// NOTE(review): the href is appended to the bare domain, so the printed
			// link presumably carries no scheme - confirm this is the intended format.
			System.out.println(site.getDomain()+title.attr("href"));
			System.out.println(title.text());
			System.out.println(summary == null ? "" : summary.text());
		}
	}

	/**
	 * Returns the crawl settings used by the spider for this processor.
	 *
	 * @return the site configuration held by this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Starts a spider backed by this processor and blocks until it finishes.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {

		Spider.create(new WebMagicProcessor())
				// Start crawling from "http://www.jianshu.com"
				.addUrl("http://www.jianshu.com")
				// Crawl with 5 threads
				.thread(5)
				// Start the spider
				.run();
	}
}
